Chapter 4: Matrix Applications to Regression Models

In [ ]:
#If using Colab, you can only pull files from your Google Drive
#Run the following code to give Colab access
from google.colab import drive
drive.mount('/content/drive')

#File paths should then look like this:
file_path = '/content/drive/MyDrive/LinAlg/data/file.png'
#As opposed to your computer's directory, which might look like this:
file_path = '/Users/yourname/LinAlg/data/file.png'
In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import t, kendalltau
from statsmodels.graphics.gofplots import qqplot
from statsmodels.stats.stattools import durbin_watson
from scipy.stats import ks_2samp, norm
from statsmodels.tsa.stattools import acf
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive

Colorado temperature lapse rate & analysis (rerun)

In [ ]:
#Fig 4.1
x = np.array([
    1671.5, 1635.6, 2097.0, 1295.4, 1822.7, 2396.9, 2763.0, 1284.7,
    1525.2, 1328.6, 1378.9, 2323.8, 2757.8, 1033.3, 1105.5, 1185.7,
    2343.9, 1764.5, 1271.0, 2347.3, 2094.0, 2643.2, 1837.9, 1121.7])
y = np.array([
    22.064, 23.591, 18.464, 23.995, 20.645, 17.175, 13.582, 24.635,
    22.178, 24.002, 23.952, 16.613, 13.588, 25.645, 25.625, 25.828,
    17.626, 22.433, 24.539, 17.364, 17.327, 15.413, 22.174, 24.549])

#Linear regression
X = sm.add_constant(x)
model = sm.OLS(y, X).fit()
print(model.summary())

plt.figure(figsize=(10, 6))
plt.scatter(x, y)
plt.plot(x, model.predict(X), color='black', linewidth=3)
plt.xlabel("Elevation [m]", fontsize=14)
plt.ylabel("Temperature [°C]", fontsize=14)
plt.title("Colorado Elevation and July Tmean: 1981-2010 Average", fontsize=14)
plt.text(2100, 25.5, "Temperature lapse rate: 7.0 °C/km", fontsize=12)
plt.text(2350, 24, "y = 33.48 - 0.0070 x", fontsize=12)
plt.text(2350, 22.5, "R-squared = 0.96", fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()

#Colorado TLR regression analysis
residuals = model.resid
print("Rounded residuals:", np.round(residuals, 5))
print("Mean of residuals:", np.mean(residuals))

xa = x - np.mean(x)
print("Sum of xa * residuals:", np.sum(xa * residuals))
print("Residual variance:", np.sum(residuals**2) / (len(y) - 2))

#Fig 4.2
fig, ax = plt.subplots(figsize=(8, 4))

#Hide axes
ax.axis('off')
ax.set_xlim(0, 5.2)
ax.set_ylim(0, 2.2)

ax.arrow(0, 0, 4, 0, head_width=0.05, head_length=0.15, fc='black', ec='black', linewidth=3)
ax.arrow(4, 0, 0, 2, head_width=0.05, head_length=0.15, fc='black', ec='black', linewidth=3)
ax.arrow(0, 0, 4, 2, head_width=0.05, head_length=0.15, fc='black', ec='black', linewidth=3)

ax.arrow(0, 0, 5, 0, linestyle='dashed', linewidth=2, color='gray', length_includes_head=True)
ax.arrow(5, 0, -1, 2, linestyle='dashed', linewidth=2, color='gray', length_includes_head=True)
ax.arrow(3, 0, 1, 2, linestyle='dashed', linewidth=2, color='gray', length_includes_head=True)
ax.arrow(0, 0, 3, 0, linestyle='dashed', linewidth=2, color='gray', length_includes_head=True)

ax.plot([3.9, 3.9], [0, 0.1], color='black')
ax.plot([3.9, 4.0], [0.1, 0.1], color='black')

ax.text(2, 0.2, r'$\hat{b}\mathbf{x}_a$', fontsize=16)
ax.text(2, 1.2, r'$\mathbf{y}_a$', fontsize=16)
ax.text(4.1, 1.0, r'$\mathbf{e}$', fontsize=16)
ax.text(3.8, 0.6, "Shortest $\mathbf{e}$", fontsize=13, rotation=90)
ax.text(3.4, 1.1, "Longer $\mathbf{e}$", fontsize=13, rotation=71)
ax.text(4.6, 1.1, "Longer $\mathbf{e}$", fontsize=13, rotation=-71)

plt.tight_layout()
plt.show()

#Estimating regression slope b with 2 methods

#Using projection method
nxa = np.sqrt(np.sum(xa**2))
ya = y - np.mean(y)
b_proj = np.sum(ya * (xa / nxa)) / nxa
print("Slope (projection method):", b_proj)

#Using correlation
corxy = np.corrcoef(xa, ya)[0, 1]
nya = np.sqrt(np.sum(ya**2))
b_corr = corxy * nya / nxa
print("Slope (correlation method):", b_corr)

#R-squared verification
yhat = model.fittedvalues
print("Variance of fitted values:", np.var(yhat, ddof=1))
print("Variance of y:", np.var(y, ddof=1))
print("R-squared:", np.var(yhat, ddof=1) / np.var(y, ddof=1))
print("Correlation^2:", np.corrcoef(x, y)[0, 1] ** 2)

#Confidence intervals
x1 = np.linspace(max(x), min(x), 100)
X1 = sm.add_constant(x1)
s_squared = np.sum(residuals**2) / (len(y) - 2)
s = np.sqrt(s_squared)
xbar = np.mean(x)
Sxx = np.sum((x - xbar)**2)
n = len(y)
tval = t.ppf(0.975, df=n-2)

modTLR = 33.476216 + -0.006982 * x1
CIupperModel = modTLR + tval * s * np.sqrt((1/n) + (x1 - xbar)**2 / Sxx)
CIlowerModel = modTLR - tval * s * np.sqrt((1/n) + (x1 - xbar)**2 / Sxx)
CIupperResponse = modTLR + tval * s * np.sqrt(1 + (1/n) + (x1 - xbar)**2 / Sxx)
CIlowerResponse = modTLR - tval * s * np.sqrt(1 + (1/n) + (x1 - xbar)**2 / Sxx)

#Fig 4.3
plt.figure(figsize=(10, 6))
plt.scatter(x, y)
plt.plot(x1, CIupperModel, 'r')
plt.plot(x1, CIlowerModel, 'r')
plt.plot(x1, CIupperResponse, 'b')
plt.plot(x1, CIlowerResponse, 'b')
plt.plot(x, model.predict(X), 'k', linewidth=3)
plt.xlabel("Elevation [m]", fontsize=14)
plt.ylabel("Temperature [°C]", fontsize=14)
plt.title("Colorado Elevation and July Tmean: 1981-2010 Average", fontsize=14)
plt.text(2280, 26, "Temperature lapse rate: 7.0 °C/km", fontsize=12)
plt.text(2350, 27.5, "R-squared = 0.96", fontsize=12)
plt.text(2350, 29, "y = 33.48 - 0.0070 x", fontsize=12)
plt.text(1600, 15, "Blue lines: CI of July Tmean RV", color='blue', fontsize=12)
plt.text(1600, 13.5, "Red lines: CI of the fitted model", color='red', fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()

#Fig 4.4
plt.figure(figsize=(10, 6))
plt.scatter(x, residuals, marker='x')
plt.xlabel("Elevation [m]", fontsize=14)
plt.ylabel("Residual Temp [°C]", fontsize=14)
plt.title("Residuals of the Colorado 1981-2010 July Tmean vs. Elevation", fontsize=14)
plt.axhline(0, color='gray', linestyle='--')
plt.grid(True)
plt.tight_layout()
plt.show()

#Fig 4.5
plt.figure(figsize=(6, 6))
qqplot(residuals, line='45', fit=True)
plt.title("QQ-Normal Plot for the Colorado TLR Residuals")
plt.tight_layout()
plt.show()

#Durbin-Watson test
print("Durbin-Watson:", durbin_watson(residuals))

#Mann-Kendall test
ElevTemp = np.column_stack((x, y))
ElevTemp = ElevTemp[np.argsort(ElevTemp[:, 0])]
residuals_sorted = sm.OLS(ElevTemp[:, 1], sm.add_constant(ElevTemp[:, 0])).fit().resid
tau, p_value_resid = kendalltau(range(len(residuals_sorted)), residuals_sorted)
tau_y, p_value_y = kendalltau(range(len(ElevTemp[:, 1])), ElevTemp[:, 1])
print("Mann-Kendall test on residuals: tau =", tau, ", p =", p_value_resid)
print("Mann-Kendall test on y: tau =", tau_y, ", p =", p_value_y)

#Multivariate regression
lat = np.array([39.9919, 38.4600, 39.2203, 38.8236, 39.2425, 37.6742,
      39.6261, 38.4775, 40.6147, 40.2600, 39.1653, 38.5258,
      37.7717, 38.0494, 38.0936, 38.0636, 37.1742, 38.4858,
      38.0392, 38.0858, 40.4883, 37.9492, 37.1786, 40.0583])
lon = np.array([-105.2667, -105.2256, -105.2783, -102.3486, -107.9631, -106.3247,
  -106.0353, -102.7808, -105.1314, -103.8156, -108.7331, -106.9675,
  -107.1097, -102.1236, -102.6306, -103.2153, -105.9392, -107.8792,
  -103.6933, -106.1444, -106.8233, -107.8733, -104.4869, -102.2189])
elev = x; temp = y
df = pd.DataFrame({'lat': lat, 'lon': lon, 'elev': elev, 'temp': temp})
X_multi = sm.add_constant(df[['lat', 'lon', 'elev']])
model_multi = sm.OLS(df['temp'], X_multi).fit()
print(model_multi.summary())
print("Coefficients:", model_multi.params.round(5))
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.963
Model:                            OLS   Adj. R-squared:                  0.961
Method:                 Least Squares   F-statistic:                     574.5
Date:                Sun, 11 May 2025   Prob (F-statistic):           2.94e-17
Time:                        05:59:06   Log-Likelihood:                -27.071
No. Observations:                  24   AIC:                             58.14
Df Residuals:                      22   BIC:                             60.50
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         33.4763      0.546     61.309      0.000      32.344      34.609
x1            -0.0070      0.000    -23.969      0.000      -0.008      -0.006
==============================================================================
Omnibus:                        0.891   Durbin-Watson:                   2.027
Prob(Omnibus):                  0.640   Jarque-Bera (JB):                0.514
Skew:                           0.355   Prob(JB):                        0.773
Kurtosis:                       2.898   Cond. No.                     6.42e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 6.42e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
Rounded residuals: [ 0.25792  1.53427 -0.37129 -0.43697 -0.10542  0.43358 -0.60335  0.12833
 -0.64953 -0.19817  0.10302 -0.6388  -0.63366 -0.61692 -0.13283  0.63012
  0.51454  1.27623 -0.06333  0.27628 -1.52923  0.39122  1.52971 -1.09572]
Mean of residuals: -2.5905203907920318e-15
Sum of xa * residuals: 3.012701199622825e-11
Residual variance: 0.6096193452251237
No description has been provided for this image
Slope (projection method): -0.0069818845648987665
Slope (correlation method): -0.006981884564898769
Variance of fitted values: 15.227212262175986
Variance of y: 15.810326418478262
R-squared: 0.9631181456430423
Correlation^2: 0.9631181456430413
No description has been provided for this image
No description has been provided for this image
<Figure size 600x600 with 0 Axes>
No description has been provided for this image
Durbin-Watson: 2.026779641075883
Mann-Kendall test on residuals: tau = 0.07246376811594202 , p = 0.6409517533780805
Mann-Kendall test on y: tau = -0.8768115942028986 , p = 2.0944279884835788e-13
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   temp   R-squared:                       0.979
Model:                            OLS   Adj. R-squared:                  0.976
Method:                 Least Squares   F-statistic:                     311.1
Date:                Sun, 11 May 2025   Prob (F-statistic):           6.05e-17
Time:                        05:59:07   Log-Likelihood:                -20.300
No. Observations:                  24   AIC:                             48.60
Df Residuals:                      20   BIC:                             53.31
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         36.4400      9.436      3.862      0.001      16.758      56.122
lat           -0.4925      0.132     -3.731      0.001      -0.768      -0.217
lon           -0.1631      0.089     -1.834      0.082      -0.349       0.022
elev          -0.0076      0.000    -22.953      0.000      -0.008      -0.007
==============================================================================
Omnibus:                        0.854   Durbin-Watson:                   1.336
Prob(Omnibus):                  0.652   Jarque-Bera (JB):                0.780
Skew:                           0.391   Prob(JB):                        0.677
Kurtosis:                       2.588   Cond. No.                     1.41e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.41e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
Coefficients: const    36.43996
lat      -0.49251
lon      -0.16308
elev     -0.00757
dtype: float64

More regression diagnostics

In [ ]:
#Fig. 4.6 and make regression diagnostics

df = pd.read_csv("/content/drive/MyDrive/LinAlg/data/aravg.ann.land_ocean.90S.90N.v4.0.1.201907.txt",
                 delim_whitespace=True, header=None)
x = df.iloc[:139, 0].values
y = df.iloc[:139, 1].values

#Fit linear model
X = sm.add_constant(x)
model = sm.OLS(y, X).fit()
print(model.summary())

#Residuals and standard error
residuals = model.resid
SSE = np.sum(residuals**2)
s_squared = SSE / (len(y) - 2)
s = np.sqrt(s_squared)

#Model predictions and confidence intervals
modT = model.params[0] + model.params[1] * x
xbar = np.mean(x)
Sxx = np.sum((x - xbar) ** 2)
n = len(y)
tval_full = t.ppf(0.975, df=n-2)
tval_reduced = t.ppf(0.975, df=5)

CI_model = tval_full * s * np.sqrt((1/n) + (x - xbar)**2 / Sxx)
CI_response = tval_full * s * np.sqrt(1 + (1/n) + (x - xbar)**2 / Sxx)

CI_model_r = tval_reduced * s * np.sqrt((1/n) + (x - xbar)**2 / Sxx)
CI_response_r = tval_reduced * s * np.sqrt(1 + (1/n) + (x - xbar)**2 / Sxx)

#Plotting
fig, axs = plt.subplots(2, 1, figsize=(10, 8), gridspec_kw={'height_ratios': [2, 1]})
axs[0].plot(x, y, 'o-', label='Data')
axs[0].plot(x, modT, 'k-', label='Regression line', linewidth=3)
axs[0].plot(x, modT + CI_model, 'r-', label='95% CI model')
axs[0].plot(x, modT - CI_model, 'r-')
axs[0].plot(x, modT + CI_response, 'b-', label='95% CI response')
axs[0].plot(x, modT - CI_response, 'b-')
axs[0].plot(x, modT + CI_model_r, 'r--')
axs[0].plot(x, modT - CI_model_r, 'r--')
axs[0].plot(x, modT + CI_response_r, 'b--')
axs[0].plot(x, modT - CI_response_r, 'b--')
axs[0].set_ylim([-1.5, 1])
axs[0].set_ylabel("Temperature [°C]")
axs[0].set_title("Global Annual Mean Surface Temperature Anomalies")
axs[0].legend()
axs[0].text(1940, 0.5, "Linear trend: 0.7348 °C per century")
axs[0].text(1880, 0.9, "(a)")

axs[1].plot(x, residuals, marker='o', linestyle='none')
axs[1].axhline(0, color='gray', linestyle='--')
axs[1].set_ylim([-0.6, 0.6])
axs[1].set_xlabel("Year")
axs[1].set_ylabel("Residuals [°C]")
axs[1].text(1880, 0.5, "(b)")

plt.tight_layout()
plt.show()

#KS Test for normality
resi_std = np.std(residuals)
resi_mean = np.mean(residuals)
norm_sample = np.random.normal(loc=resi_mean, scale=resi_std, size=len(residuals))
ks_stat, ks_pvalue = ks_2samp(residuals, norm_sample)
print("KS test p-value:", ks_pvalue) #The normality assumption is accepted

#Durbin-Watson Test to check independence
dw_stat = durbin_watson(residuals)
print("Durbin-Watson statistic:", dw_stat) #The independence assumption is rejected

#Autocorrelation
acf_vals = acf(y, nlags=1)
rho1 = acf_vals[1]
print("Lag-1 autocorrelation (rho1):", rho1)

#Effective degrees of freedom
edof = (n - 2) * (1 - rho1) / (1 + rho1)
print("Effective degrees of freedom (EDOF):", edof)

#Q-Q Plot
sm.qqplot(residuals, line='s')
plt.title("QQ-Normal Plot for the NOAAGlobalTemp: 1880-2018")
plt.show()

#Assume x and y are already defined from previous steps
x1 = x
y1 = y
n = len(y1)

#Autocorrelation
acf_vals = acf(y1, nlags=1)
print("Lag-1 autocorrelation:", acf_vals[1])

#Lag-1 correlation manually
lag1_corr = np.corrcoef(y1[:-1], y1[1:])[0, 1]
print("Lag-1 correlation:", lag1_corr)

#Linear regression
X1 = sm.add_constant(x1)
reg8018 = sm.OLS(y1, X1).fit()
residuals = reg8018.resid

#Confidence intervals
SSE = np.sum(residuals ** 2)
s_squared = SSE / (n - 2)
s = np.sqrt(s_squared)
modT = -14.574841 + 0.007348 * x1
xbar = np.mean(x1)
Sxx = np.sum((x1 - xbar) ** 2)

tval = t.ppf(0.975, df=n-2)
CI_model = tval * s * np.sqrt((1/n) + (x1 - xbar)**2 / Sxx)
CI_response = tval * s * np.sqrt(1 + (1/n) + (x1 - xbar)**2 / Sxx)

#Plotting the regression and confidence intervals
plt.figure(figsize=(10, 6))
plt.plot(x1, y1, 'o', label='Data')
plt.plot(x1, modT, 'k-', label='Regression Line')
plt.plot(x1, modT + CI_model, 'r-', label='95% CI (Model)')
plt.plot(x1, modT - CI_model, 'r-')
plt.plot(x1, modT + CI_response, 'b-', label='95% CI (Response)')
plt.plot(x1, modT - CI_response, 'b-')
plt.xlabel("Elevation [m]")
plt.ylabel("Temperature [°C]")
plt.title("Colorado Elevation and July Tmean: 1981-2010 Average")
plt.legend()
plt.show()

#Q-Q plot
sm.qqplot(residuals, line='s')
plt.title("QQ-Normal Plot for the NOAAGlobalTemp: 1880-2018")
plt.ylim([-0.4, 0.4])
plt.xlim([-3, 3])
plt.show()

#KS test for normality
resi_mean = np.mean(residuals)
resi_sd = np.std(residuals)
norm_sample = np.random.normal(loc=resi_mean, scale=resi_sd, size=n)
ks_stat, ks_pvalue = ks_2samp(residuals, norm_sample)
print("KS test p-value:", ks_pvalue) #Conclusion: Normal distribution is not rejected.

#Durbin-Watson test
dw_stat = durbin_watson(residuals)
print("Durbin-Watson statistic:", dw_stat) #Conclusion: There is a significant serial correlation.

#Additional autocorrelation
acf_full = acf(y1)
print("Full ACF lag 1:", acf_full[1])

#Correlation with self (should be 1)
print("Correlation of y with itself:", np.corrcoef(y1, y1)[0, 1])
<ipython-input-3-2e68aa9e6b05>:3: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\s+'`` instead
  df = pd.read_csv("/content/drive/MyDrive/LinAlg/data/aravg.ann.land_ocean.90S.90N.v4.0.1.201907.txt",
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.758
Model:                            OLS   Adj. R-squared:                  0.757
Method:                 Least Squares   F-statistic:                     429.8
Date:                Sun, 11 May 2025   Prob (F-statistic):           4.43e-44
Time:                        05:59:17   Log-Likelihood:                 58.294
No. Observations:                 139   AIC:                            -112.6
Df Residuals:                     137   BIC:                            -106.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -13.8729      0.660    -21.009      0.000     -15.179     -12.567
x1             0.0070      0.000     20.732      0.000       0.006       0.008
==============================================================================
Omnibus:                        2.965   Durbin-Watson:                   0.405
Prob(Omnibus):                  0.227   Jarque-Bera (JB):                2.650
Skew:                           0.248   Prob(JB):                        0.266
Kurtosis:                       2.540   Cond. No.                     9.47e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.47e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
KS test p-value: 0.7787330430200653
Durbin-Watson statistic: 0.4048370785614146
Lag-1 autocorrelation (rho1): 0.9317402443446575
Effective degrees of freedom (EDOF): 4.84101656636265
No description has been provided for this image
Lag-1 autocorrelation: 0.9317402443446575
Lag-1 correlation: 0.9502564870442989
No description has been provided for this image
No description has been provided for this image
KS test p-value: 0.25018967168860684
Durbin-Watson statistic: 0.4048370785614146
Full ACF lag 1: 0.9317402443446575
Correlation of y with itself: 1.0